# NOTE(review): earlier attempt, kept for reference. It fails as written:
# jieba.cut returns a generator, and CountVectorizer.fit_transform expects an
# iterable of strings (each document must support .lower()), so passing a list
# of generators raises AttributeError. The segmented words must be joined back
# into space-separated strings first, e.g. ' '.join(jieba.cut(s)).
#
# from sklearn.feature_extraction.text import CountVectorizer
# import jieba
#
# sentence = ['苹果苹果','葡萄苹果西瓜','葡萄']
# word_list = []
# for s in sentence:
#     words = jieba.cut(s,cut_all=False)
#     word_list.append(words)
#
# print(word_list)
#
# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(word_list)
# print(vectorizer.get_feature_names_out())
# print(X.toarray())

from sklearn.feature_extraction.text import CountVectorizer


def main() -> None:
    """Demonstrate CountVectorizer on short Chinese strings.

    Prints the learned vocabulary followed by the term-frequency matrix.

    NOTE(review): CountVectorizer's default token pattern splits on
    whitespace/punctuation, so each unsegmented Chinese string is treated as
    a single token — '苹果苹果' becomes one feature rather than 苹果 x 2.
    True per-word counts would require segmenting the text first (e.g. with
    jieba) and joining the words with spaces before vectorizing.
    """
    # Sample text data: three short, unsegmented Chinese strings.
    sentence = ['苹果苹果', '葡萄苹果西瓜', '葡萄']

    # Create the CountVectorizer and convert the text data into a
    # term-frequency matrix in one pass.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(sentence)

    # Print the feature names (vocabulary) learned from the data.
    print(vectorizer.get_feature_names_out())

    # Print the term-frequency matrix: one row per input string,
    # one column per vocabulary entry.
    print(X.toarray())


if __name__ == "__main__":
    main()